library(tidyverse)

# Combined two-cohort CRC call set. `comment = '##'` makes readr ignore
# everything after a `##`, so the meta lines are dropped; the remaining header
# row presumably supplies the VCF column names (ID, POS, INFO, ...).
twoset <- read_delim(
  file = '~/append-ssd/nextflowing/downstreaming/public_crc/crc_2sets_all.vcf.gz',
  comment = '##'
)

# Per-sample calls at rs1050501: columns 10 onward are stacked into
# `sample` / `genotype` pairs, then only columns 9-11 of the long table are
# kept (presumably FORMAT, sample, genotype).
twoset |>
  filter(ID == 'rs1050501') |>
  pivot_longer(
    cols = 10:last_col(),
    names_to = 'sample',
    values_to = 'genotype'
  ) |>
  select(9:11)

# Read samplesheet.valid.csv from the rnavar run; the result is not assigned.
read_delim(
  file = '~/append-ssd/nextflowing/rnavar-crc-guo2021sided/pipeline_info/samplesheet.valid.csv'
)

# zhang2020 ESCC ------------
## impute5 -------
# Label genotype strings by their leading GT field:
#   0/0 or 0|0 -> 'II',  1/1 or 1|1 -> 'TT',  0/1, 1/0, 0|1, 1|0 -> 'IT'.
# Anything else (missing calls such as './.', other allele indices, NA) yields
# NA instead of being silently counted as heterozygous.
classify_i2t <- function(genotype) {
  case_when(
    str_detect(genotype, '^0[/|]0') ~ 'II',
    str_detect(genotype, '^1[/|]1') ~ 'TT',
    str_detect(genotype, '^(0[/|]1|1[/|]0)') ~ 'IT',
    .default = NA_character_
  )
}

# Read one VCF (text after `##` is ignored) and return a long table with one
# row per sample column: VCF column 9 (presumably FORMAT), `sample`,
# `genotype`, and the derived `aa_type` label.
read_i2t_calls <- function(path) {
  read_delim(path, comment = '##') |>
    pivot_longer(10:last_col(), names_to = 'sample', values_to = 'genotype') |>
    select(9:11) |>
    mutate(aa_type = classify_i2t(genotype))
}

# Rows of `ref` whose label differs from `other` for the same sample. The first
# column is dropped from both tables before joining on `sample`. A label that
# is NA on only one side (unclassifiable call, or sample absent from `other`)
# counts as a disagreement; NA on both sides does not.
diff_i2t_calls <- function(ref, other) {
  ref[, -1] |>
    left_join(other[, -1], by = 'sample') |>
    filter(
      is.na(aa_type.x) != is.na(aa_type.y) |
        (!is.na(aa_type.x) & !is.na(aa_type.y) & aa_type.x != aa_type.y)
    )
}

# Three runs of the same cohort; the file names differ only in the 3mb / 5mb /
# 20mb tag (presumably the imputation window size).
zhang20escc1.3 <- read_i2t_calls('~/append-ssd/nextflowing/downstreaming/pub_escc/zhang2020/i2t.3mb.imp5.vcf')
zhang20escc1.5 <- read_i2t_calls('~/append-ssd/nextflowing/downstreaming/pub_escc/zhang2020/i2t.5mb.imp5.vcf')
zhang20escc1.20 <- read_i2t_calls('~/append-ssd/nextflowing/downstreaming/pub_escc/zhang2020/i2t.20mb.imp5.vcf')

# Samples whose label changes between the 3mb run and the 5mb run.
diff_i2t_calls(zhang20escc1.3, zhang20escc1.5)

# Samples whose label changes between the 3mb run and the 20mb run.
diff_i2t_calls(zhang20escc1.3, zhang20escc1.20)

### chr14 g2r
# zhang2020 chr14 imputed calls; text after `##` is ignored on read.
zhang20escc14 <- read_delim(
  file = '~/append-ssd/nextflowing/downstreaming/pub_escc/zhang2020/zhang2020.chr14.imputed.vcf.gz',
  comment = '##'
)

# Export per-sample genotypes at POS 105737776: one row per sample column,
# keeping columns 9-11 of the long table (presumably FORMAT, sample, genotype).
zhang20escc14 |>
  filter(POS == 105737776) |>
  pivot_longer(
    cols = 10:last_col(),
    names_to = 'sample',
    values_to = 'genotype'
  ) |>
  select(9:11) |>
  write_csv('zhang20escc_gr.csv')

## glimpse2 -------

## beagle v5 ----------

## minimac4 ---------


# berlin 2023 CRC ------
# berlin23 chr1 imputed calls; text after `##` is ignored on read.
berlin23.chr1 <- read_delim(
  file = '~/append-ssd/nextflowing/downstreaming/pub_escc/berlin2023/berlin23.chr1.imputed.vcf.gz',
  comment = '##'
)

# Per-sample genotypes at chr1 POS 161674008, written to 'berlin23_it.csv'.
# Columns 10 onward become `sample` / `genotype` rows; columns 9-11 are kept.
berlin23.chr1 |>
  filter(POS == 161674008) |>
  pivot_longer(
    cols = 10:last_col(),
    names_to = 'sample',
    values_to = 'genotype'
  ) |>
  select(9:11) |>
  write_csv('berlin23_it.csv')

# berlin23 chr14 imputed calls, read the same way.
berlin23.chr14 <- read_delim(
  file = '~/append-ssd/nextflowing/downstreaming/pub_escc/berlin2023/berlin23.chr14.imputed.vcf.gz',
  comment = '##'
)

# Per-sample genotypes at chr14 POS 105737776, written to 'berlin23_gr.csv'.
berlin23.chr14 |>
  filter(POS == 105737776) |>
  pivot_longer(
    cols = 10:last_col(),
    names_to = 'sample',
    values_to = 'genotype'
  ) |>
  select(9:11) |>
  write_csv('berlin23_gr.csv')

### glimpse2
# berlin23 chr14 calls from the glimpse2 run; text after `##` is ignored.
be23.gli2.chr14 <-
  read_delim('~/append-ssd/glimpse-bio/berlin23.chr14.glimpse2.vcf.gz', comment = '##')

# Show the INFO field of the record at POS 105737776.
be23.gli2.chr14 |> filter(POS == 105737776) |>
  pull(INFO)

# Export per-sample genotypes at POS 105737776. The file name carries a
# glimpse2 tag so this export does not overwrite 'berlin23_gr.csv', which the
# berlin23.chr14 imputed export earlier in this script already writes.
be23.gli2.chr14 |> filter(POS == 105737776) |>
  pivot_longer(10:last_col(), names_to = 'sample', values_to = 'genotype') |>
  select(9:11) |>
  write_csv('berlin23_gli2_gr.csv')

### minimac4
# Read the minimac4 chr14 output; the result is not assigned.
read_delim('~/learn/minimac4-conda/Minimac4/build/berlin23.chr14.minimac4.vcf.gz', comment = '##')

# walker 2021 EAC --------
## FCGR2B I232T
### impute5
# walker21 chr1 impute5 calls; text after `##` is ignored on read.
walker.imp5.chr1 <- read_delim(
  file = '~/append-ssd/nextflowing/downstreaming/pub_escc/walker21eac/walker21.chr1.impute5.vcf.gz',
  comment = '##'
)

# Long-format view of the record at POS 161674008 with every column retained
# (not assigned or written anywhere).
walker.imp5.chr1 |>
  filter(POS == 161674008) |>
  pivot_longer(
    cols = 10:last_col(),
    names_to = 'sample',
    values_to = 'genotype'
  )

# Same reshape, trimmed to columns 9-11 (presumably FORMAT, sample, genotype)
# and written to 'walker21_imp5_it.csv'.
walker.imp5.chr1 |>
  filter(POS == 161674008) |>
  pivot_longer(
    cols = 10:last_col(),
    names_to = 'sample',
    values_to = 'genotype'
  ) |>
  select(9:11) |>
  write_csv('walker21_imp5_it.csv')

### glimpse2
# walker21 chr1 glimpse2 calls; text after `##` is ignored on read.
walker.gli2.chr1 <- read_delim(
  file = '~/append-ssd/nextflowing/downstreaming/pub_escc/walker21eac/walker21.chr1.160m.glimpse2.vcf.gz',
  comment = '##'
)

# Show the INFO field of the record at POS 161674008.
walker.gli2.chr1 |>
  filter(POS == 161674008) |>
  pull(INFO)

# Per-sample genotypes at POS 161674008: columns 10 onward become
# `sample` / `genotype` rows, columns 9-11 are kept and written out.
walker.gli2.chr1 |>
  filter(POS == 161674008) |>
  pivot_longer(
    cols = 10:last_col(),
    names_to = 'sample',
    values_to = 'genotype'
  ) |>
  select(9:11) |>
  write_csv('walker21_gli2_it.csv')

### beagle v5
# walker21 beagle calls for the I232T site; text after `##` is ignored on read.
walker.bgl.chr1 <- read_delim('~/append-ssd/nextflowing/downstreaming/pub_escc/walker21eac/walker21.beagle.i2t.vcf', comment = '##')

# Show the INFO field of the record at POS 161674008.
walker.bgl.chr1 |>
  filter(POS == 161674008) |>
  pull(INFO)

# Export per-sample genotypes with an `aa_type` label taken from the leading
# GT field: 0/0 -> 'II', 1/1 -> 'TT', 0/1 or 1/0 -> 'IT' ('/' or '|' accepted).
# Heterozygotes are matched explicitly, so a missing call ('./.') or any other
# allele combination becomes NA instead of being written out as 'IT'.
walker.bgl.chr1 |>
  filter(POS == 161674008) |>
  pivot_longer(10:last_col(), names_to = 'sample', values_to = 'genotype') |>
  select(9:11) |>
  mutate(aa_type = case_when(
    str_detect(genotype, '^0[/|]0') ~ 'II',
    str_detect(genotype, '^1[/|]1') ~ 'TT',
    str_detect(genotype, '^(0[/|]1|1[/|]0)') ~ 'IT',
    .default = NA_character_
  )) |>
  write_csv('walker21_bgl_it.csv')



## IGHG1 G396R
### impute5
# walker21 chr14 impute5 calls; text after `##` is ignored on read.
walker.imp5.chr14 <- read_delim(
  file = '~/append-ssd/nextflowing/downstreaming/pub_escc/walker21eac/walker21.chr14.impute5.vcf.gz',
  comment = '##'
)

# Show the INFO field of the record at POS 105737776.
walker.imp5.chr14 |>
  filter(POS == 105737776) |>
  pull(INFO)

# Per-sample genotypes at POS 105737776: columns 10 onward become
# `sample` / `genotype` rows, columns 9-11 are kept and written out.
walker.imp5.chr14 |>
  filter(POS == 105737776) |>
  pivot_longer(
    cols = 10:last_col(),
    names_to = 'sample',
    values_to = 'genotype'
  ) |>
  select(9:11) |>
  write_csv('walker21_imp5.gr.csv')

### glimpse2
# walker21 chr14 glimpse2 calls; text after `##` is ignored on read.
walker.gli2.chr14 <- read_delim(
  file = '~/append-ssd/nextflowing/downstreaming/pub_escc/walker21eac/walker21.chr14.104m.glimpse2.vcf.gz',
  comment = '##'
)

# Show the INFO field of the record at POS 105737776.
walker.gli2.chr14 |>
  filter(POS == 105737776) |>
  pull(INFO)

# Per-sample genotypes at POS 105737776: columns 10 onward become
# `sample` / `genotype` rows, columns 9-11 are kept and written out.
walker.gli2.chr14 |>
  filter(POS == 105737776) |>
  pivot_longer(
    cols = 10:last_col(),
    names_to = 'sample',
    values_to = 'genotype'
  ) |>
  select(9:11) |>
  write_csv('walker21_gli2.gr.csv')

### beagle v5
# walker21 beagle calls for the G396R site; text after `##` is ignored on read.
walker.bgl.chr14 <- read_delim('~/append-ssd/nextflowing/downstreaming/pub_escc/walker21eac/walker21.beagle.g2r.vcf', comment = '##')

# Show the INFO field of every record in the file.
walker.bgl.chr14 |>
  pull(INFO)

# NOTE(review): unlike the other chr14 exports in this script there is no
# `filter(POS == 105737776)` here, so every record in the file is labelled.
# Presumably the g2r VCF already holds only the target site -- confirm.
#
# Export per-sample genotypes with an `aa_type` label taken from the leading
# GT field: 0/0 -> 'GG', 1/1 -> 'RR', 0/1 or 1/0 -> 'GR' ('/' or '|' accepted).
# Heterozygotes are matched explicitly, so a missing call ('./.') or any other
# allele combination becomes NA instead of being written out as 'GR'.
walker.bgl.chr14 |>
  pivot_longer(10:last_col(), names_to = 'sample', values_to = 'genotype') |>
  select(9:11) |>
  mutate(aa_type = case_when(
    str_detect(genotype, '^0[/|]0') ~ 'GG',
    str_detect(genotype, '^1[/|]1') ~ 'RR',
    str_detect(genotype, '^(0[/|]1|1[/|]0)') ~ 'GR',
    .default = NA_character_
  )) |>
  write_csv('walker21_bgl_gr.csv')
